/* * Tanaguru - Automated webpage assessment * Copyright (C) 2008-2015 Tanaguru.org * * This file is part of Tanaguru. * * Tanaguru is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * * Contact us by mail: tanaguru AT tanaguru DOT org */ package org.tanaguru.contentadapter.html; import org.jsoup.Jsoup; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Document; import org.jsoup.nodes.Entities; import org.jsoup.nodes.Node; import org.tanaguru.contentadapter.HTMLCleaner; /** * * @author jkowalczyk */ public class HTMLJsoupCleanerImpl extends AbstractHTMLCleaner implements HTMLCleaner { private static final String EMPTY_NS_DEFINITION_PATTERN = "xmlns=\"(\\s)*\""; private static final String NS_TAG_OPEN_PREFIX_DEFINITION_PATTERN = "<a[0-9]+:"; private static final String NS_TAG_CLOSURE_PREFIX_DEFINITION_PATTERN = "</a[0-9]+:"; static final String CORRECTOR_NAME = "JsoupCleaner"; public HTMLJsoupCleanerImpl() { super(); } @Override public void run() { dirtyHTML = removeBadNamespaceDefinition(dirtyHTML); Document doc = Jsoup.parse(dirtyHTML); doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml); doc.outputSettings().outline(true); doc.outputSettings().indentAmount(2); removeComments(doc); removeMalformedAttributes(doc); result = doc.outerHtml(); } /** * Remove the comments of the page * * @param node */ private void removeComments(Node node) { // as we are removing child nodes while iterating, we cannot use a normal foreach over children, // or will get a concurrent list modification error. int i = 0; while (i < node.childNodes().size()) { Node child = node.childNode(i); if (child.nodeName().equals("#comment")) child.remove(); else { removeComments(child); i++; } } } /** * Remove the comments of the page * * @param node */ private void removeMalformedAttributes(Node node) { // as we are removing child nodes while iterating, we cannot use a normal foreach over children, // or will get a concurrent list modification error. int i = 0; while (i < node.childNodes().size()) { Node child = node.childNode(i); for (Attribute attr : child.attributes()) { if (attr.getKey().startsWith("\"") && attr.getKey().endsWith("\"")) { child.removeAttr(attr.getKey()); } } removeMalformedAttributes(child); i++; } } /** * Webdriver may return some html with namespace prefixed tag. This method * provides a clean operation on the source code, to enable its parse with * Jsoup * @param dirtyHTML * @return */ private String removeBadNamespaceDefinition(String dirtyHTML) { return dirtyHTML.replaceAll(EMPTY_NS_DEFINITION_PATTERN, "") .replaceAll(NS_TAG_OPEN_PREFIX_DEFINITION_PATTERN, "<") .replaceAll(NS_TAG_CLOSURE_PREFIX_DEFINITION_PATTERN, "</"); } @Override public String getCorrectorName() { return CORRECTOR_NAME; } }